import os
import geopandas as gpd
import pandas as pd
import numpy as np
from shapely.geometry import Point
from math import radians, sin, cos, sqrt, atan2
from concurrent.futures import ProcessPoolExecutor, as_completed

# Set the root directory
root_directory = r'P:/Detecting Prime Locations'

# File paths (relative to root_directory)
block_group_file = os.path.join(root_directory, r'_Data/US METRO DATA/BlockData/block_group_coordinates.csv')
pl_file = os.path.join(root_directory, r'_Work/Output/Data/US-CBSAs/PL-data.csv')
output_file = os.path.join(root_directory, r'_Work/TEMP/GISUS_using/BlockGroup2PLdist.csv')

# Haversine distance function
def haversine(lon1, lat1, lon2, lat2):
    R = 6371000  # Earth's radius in meters
    dlon = radians(lon2 - lon1)
    dlat = radians(lat2 - lat1)
    a = sin(dlat / 2)**2 + cos(radians(lat1)) * cos(radians(lat2)) * sin(dlon / 2)**2
    c = 2 * atan2(sqrt(a), sqrt(1 - a))
    return R * c

# Function to compute nearest PL for a subset of block groups
def compute_nearest_for_subset(block_subset, pl_data):
    results = []
    for _, block in block_subset.iterrows():
        block_lat, block_lon = block['intptlat10'], block['intptlon10']
        min_distance = float('inf')
        nearest_plid = None

        for _, pl in pl_data.iterrows():
            pl_lat, pl_lon = pl['PL_lat'], pl['PL_lon']
            distance = haversine(block_lon, block_lat, pl_lon, pl_lat)

            if distance < min_distance:
                min_distance = distance
                nearest_plid = pl['PLID_US']

        # Append the result as a tuple: (block geoid, nearest distance, nearest plid)
        results.append((block['geoid'], min_distance, nearest_plid))
    return results

# Main function to control the execution flow
def main():
    # Load data
    block_df = pd.read_csv(block_group_file)
    pl_df = pd.read_csv(pl_file)

    # Split block groups into subsets for parallel processing
    num_workers = os.cpu_count()  # Number of parallel processes to use
    block_subsets = np.array_split(block_df, num_workers)

    # List to store the final results
    final_results = []

    # Use ProcessPoolExecutor for parallel processing
    with ProcessPoolExecutor(max_workers=num_workers) as executor:
        # Submit each subset to the executor
        future_to_subset = {executor.submit(compute_nearest_for_subset, subset, pl_df): subset for subset in block_subsets}

        # Retrieve results as each subset finishes
        for future in as_completed(future_to_subset):
            subset_result = future.result()
            final_results.extend(subset_result)
            print(f"Completed processing subset with {len(subset_result)} block groups.")

    # Convert the final results to a DataFrame
    output_df = pd.DataFrame(final_results, columns=['geoid', 'DistPL', 'plid_us'])

    # Save the output
    output_df.to_csv(output_file, index=False)

    print(f"Output saved to: {output_file}")

# Only run the main function if this is the main module
if __name__ == '__main__':
    main()
